# Compatibility shim: missingpy was written against scikit-learn < 0.22 and
# imports `sklearn.neighbors.base`, which has since been renamed to
# `sklearn.neighbors._base`. Aliasing the module keeps MissForest importable.
import sklearn.neighbors._base
import sys
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from timeit import default_timer as timer
from missingpy import MissForest
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from pprint import pprint
from scipy import stats
from matplotlib import pyplot
import warnings
# Silence noisy-but-harmless warnings emitted by sklearn/missingpy internals.
warnings.simplefilter("ignore", FutureWarning)
warnings.simplefilter("ignore", RuntimeWarning)
# Load the raw 2016 benchmarking data plus the pre-processed feature matrix
# (df_energy) and target (y_energy) produced by the earlier notebook.
df = pd.read_csv(
    "/Users/loicvalenti/Library/Mobile Documents/com~apple~CloudDocs/Formation Data Science/PROJET 4/2016_Building_Energy_Benchmarking.csv"
)
df_energy = pd.read_csv(
    "/Users/loicvalenti/Library/Mobile Documents/com~apple~CloudDocs/Formation Data Science/PROJET 4/Notebooks and files for pres/FILES NOTEBOOKS CSV/df_energy.csv"
).drop(columns="Unnamed: 0")
y_energy = pd.read_csv(
    "/Users/loicvalenti/Library/Mobile Documents/com~apple~CloudDocs/Formation Data Science/PROJET 4/Notebooks and files for pres/FILES NOTEBOOKS CSV/y_energy.csv"
).drop(columns="Unnamed: 0")

# Summary statistics with a colour gradient on mean / std / median columns.
summary_style = df_energy.describe().T.style
for stat_column in ("mean", "std", "50%"):
    summary_style = summary_style.background_gradient(
        subset=[stat_column], cmap="coolwarm"
    )
summary_style
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| BuildingAge | 3340.000000 | 0.408825 | 0.285537 | 0.008621 | 0.163793 | 0.353448 | 0.586207 | 1.000000 |
| PropertyGFATotal | 3340.000000 | 0.754311 | 0.059207 | 0.644223 | 0.707917 | 0.738140 | 0.787623 | 1.000000 |
| PropertyGFABuilding(s) | 3340.000000 | 0.755898 | 0.057183 | 0.569975 | 0.710993 | 0.741819 | 0.788294 | 1.000000 |
| LargestPropertyUseTypeGFA | 3340.000000 | 0.750707 | 0.058821 | 0.602759 | 0.706489 | 0.738361 | 0.783169 | 1.000000 |
| SteamUse | 3340.000000 | 0.038024 | 0.191283 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| NaturalGasUse | 3340.000000 | 0.629341 | 0.483054 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 |
| ElectricityUse | 3340.000000 | 0.629341 | 0.483054 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 |
| ENERGYSTARScore | 3340.000000 | 0.667212 | 0.242078 | 0.010000 | 0.540000 | 0.700000 | 0.860000 | 1.000000 |
| T_Distribution Center | 3340.000000 | 0.015868 | 0.124985 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_High-Rise Multifamily | 3340.000000 | 0.031138 | 0.173716 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Hospital | 3340.000000 | 0.002994 | 0.054644 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Hotel | 3340.000000 | 0.023054 | 0.150097 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_K-12 School | 3340.000000 | 0.037425 | 0.189830 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Laboratory | 3340.000000 | 0.002994 | 0.054644 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Large Office | 3340.000000 | 0.050599 | 0.219210 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Low-Rise Multifamily | 3340.000000 | 0.294311 | 0.455801 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
| T_Medical Office | 3340.000000 | 0.011677 | 0.107442 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Mid-Rise Multifamily | 3340.000000 | 0.168563 | 0.374421 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Mixed Use Property | 3340.000000 | 0.039521 | 0.194860 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Office | 3340.000000 | 0.000898 | 0.029961 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Other | 3340.000000 | 0.075749 | 0.264635 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Refrigerated Warehouse | 3340.000000 | 0.003593 | 0.059841 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Residence Hall | 3340.000000 | 0.006886 | 0.082709 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Restaurant | 3340.000000 | 0.003593 | 0.059841 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Retail Store | 3340.000000 | 0.027246 | 0.162822 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Self-Storage Facility | 3340.000000 | 0.008383 | 0.091189 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Senior Care Community | 3340.000000 | 0.013473 | 0.115306 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Small- and Mid-Sized Office | 3340.000000 | 0.087126 | 0.282061 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Supermarket / Grocery Store | 3340.000000 | 0.011976 | 0.108794 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_University | 3340.000000 | 0.006287 | 0.079055 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Warehouse | 3340.000000 | 0.055988 | 0.229933 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Worship Facility | 3340.000000 | 0.020659 | 0.142260 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Campus | 3340.000000 | 0.006587 | 0.080904 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Multifamily HR (10+) | 3340.000000 | 0.032635 | 0.177705 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Multifamily LR (1-4) | 3340.000000 | 0.303293 | 0.459750 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
| T_Multifamily MR (5-9) | 3340.000000 | 0.173353 | 0.378609 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_NonResidential | 3340.000000 | 0.433234 | 0.495596 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
| T_Nonresidential COS | 3340.000000 | 0.025449 | 0.157508 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_Nonresidential WA | 3340.000000 | 0.000299 | 0.017303 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| T_SPS-District K-12 | 3340.000000 | 0.025150 | 0.156603 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
# Hold out 30 % of the observations as a shuffled test set.
split_result = train_test_split(
    df_energy,
    np.ravel(y_energy),
    test_size=0.3,
    shuffle=True,
)
X_train_energy, X_test_energy, y_train_energy, y_test_energy = split_result
from autofeat import AutoFeatModel, AutoFeatRegressor, FeatureSelector
AutoFeat est une librairie qui exécute des opérations pré-définies sur les colonnes afin d'affiner la capacité prédictive d'un modèle de régression. L'algorithme calcule de nombreuses transformations et combinaisons de features. Ensuite, il choisit les plus pertinentes en suivant des critères de régularisation, et en fonction du r² d'une régression linéaire simple. Nous nous limitons ici à un petit nombre d'étapes (de 0 à 3), afin d'éviter tout overfit.
# Sweep AutoFeat over 0..3 feature-engineering steps and compare the
# held-out R^2 of the resulting linear model for each setting.
for steps in range(4):
    np.random.seed(55)  # reproducible feature selection for each run
    print("### AutoFeat with %i feateng_steps" % steps)
    afreg = AutoFeatRegressor(
        verbose=1,
        feateng_steps=steps,
        n_jobs=-1,
        max_gb=4,
        transformations=("exp", "abs", "sqrt", "^2", "^3"),
    )
    df_train_energy = afreg.fit_transform(X_train_energy,
                                          np.ravel(y_train_energy))
    df_test_energy = afreg.transform(X_test_energy)
    r2 = afreg.score(X_test_energy, np.ravel(y_test_energy))
    print("## Final R^2: %.4f" % r2)
    # Predicted vs. actual scatter for a quick visual sanity check.
    plt.figure()
    plt.scatter(afreg.predict(X_test_energy), np.ravel(y_test_energy), s=2)
    plt.title("%i FE steps (R^2: %.4f; %i new features)"
              % (steps, r2, len(afreg.new_feat_cols_)))
### AutoFeat with 0 feateng_steps [AutoFeat] The 0 step feature engineering process could generate up to 40 features. [AutoFeat] With 2338 data points this new feature matrix would use about 0.00 gb of space. [feateng] Warning: no features generated for max_steps < 1. [featsel] Scaling data...done. [Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers. [Parallel(n_jobs=-1)]: Done 1 tasks | elapsed: 4.9s [Parallel(n_jobs=-1)]: Done 2 out of 5 | elapsed: 4.9s remaining: 7.4s [Parallel(n_jobs=-1)]: Done 3 out of 5 | elapsed: 4.9s remaining: 3.3s [Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 4.9s remaining: 0.0s [Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 4.9s finished [featsel] 28 features after 5 feature selection runs [featsel] 25 features after correlation filtering [featsel] 20 features after noise filtering [AutoFeat] Final dataframe with 40 feature columns (0 new). [AutoFeat] Training final regression model. [AutoFeat] Trained model: largest coefficients: 4.770147460598833 13.545075 * PropertyGFATotal 1.349277 * T_Laboratory -1.294952 * T_Self-Storage Facility 1.076086 * T_Supermarket / Grocery Store -0.987442 * ENERGYSTARScore 0.947982 * T_Hospital -0.874325 * T_Distribution Center -0.822019 * T_Warehouse 0.764730 * T_Restaurant 0.603810 * T_Campus -0.545386 * T_Worship Facility 0.444043 * ElectricityUse 0.403221 * T_Nonresidential COS 0.271288 * T_Senior Care Community 0.258322 * SteamUse 0.229232 * T_Other 0.214557 * T_NonResidential 0.190874 * T_Large Office -0.156488 * T_Low-Rise Multifamily -0.133164 * T_Multifamily MR (5-9) [AutoFeat] Final score: 0.8027 ## Final R^2: 0.8339 ### AutoFeat with 1 feateng_steps [AutoFeat] The 1 step feature engineering process could generate up to 200 features. [AutoFeat] With 2338 data points this new feature matrix would use about 0.00 gb of space. [feateng] Step 1: transformation of original features [feateng] Generated 20 transformed features from 40 original features - done. 
[feateng] Generated altogether 20 new features in 1 steps [feateng] Removing correlated features, as well as additions at the highest level [feateng] Generated a total of 2 additional features [featsel] Scaling data...done. [Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers. [Parallel(n_jobs=-1)]: Done 1 tasks | elapsed: 0.2s [Parallel(n_jobs=-1)]: Done 2 out of 5 | elapsed: 0.3s remaining: 0.4s [Parallel(n_jobs=-1)]: Done 3 out of 5 | elapsed: 0.3s remaining: 0.2s [Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 2.2s remaining: 0.0s [Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 2.2s finished [featsel] 27 features after 5 feature selection runs [featsel] 24 features after correlation filtering [featsel] 23 features after noise filtering [AutoFeat] Final dataframe with 40 feature columns (0 new). [AutoFeat] Training final regression model. [AutoFeat] Trained model: largest coefficients: 4.760604391324167 13.550456 * PropertyGFATotal 1.412822 * T_Laboratory -1.230595 * T_Self-Storage Facility 1.139189 * T_Supermarket / Grocery Store 1.017675 * T_Hospital -0.991370 * ENERGYSTARScore 0.826893 * T_Restaurant -0.810955 * T_Distribution Center -0.758619 * T_Warehouse 0.577357 * T_Campus -0.481074 * T_Worship Facility 0.445952 * ElectricityUse 0.363597 * T_Refrigerated Warehouse 0.356740 * T_Nonresidential COS 0.302687 * T_Senior Care Community 0.289551 * T_Other 0.257416 * T_Large Office 0.242017 * SteamUse 0.187155 * T_Hotel 0.157149 * T_NonResidential -0.148707 * T_Low-Rise Multifamily -0.126397 * T_Multifamily MR (5-9) 0.089903 * T_Small- and Mid-Sized Office [AutoFeat] Final score: 0.8037 ## Final R^2: 0.8327 ### AutoFeat with 2 feateng_steps [AutoFeat] The 2 step feature engineering process could generate up to 20100 features. [AutoFeat] With 2338 data points this new feature matrix would use about 0.19 gb of space. 
[feateng] Step 1: transformation of original features [feateng] Generated 20 transformed features from 40 original features - done. [feateng] Step 2: first combination of features [feateng] Generated 1271 feature combinations from 1770 original feature tuples - done. [feateng] Generated altogether 1785 new features in 2 steps [feateng] Removing correlated features, as well as additions at the highest level [feateng] Generated a total of 403 additional features [featsel] Scaling data...done. [Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers. [Parallel(n_jobs=-1)]: Done 1 tasks | elapsed: 0.6s [Parallel(n_jobs=-1)]: Done 2 out of 5 | elapsed: 0.7s remaining: 1.0s [Parallel(n_jobs=-1)]: Done 3 out of 5 | elapsed: 0.7s remaining: 0.5s [Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 2.9s remaining: 0.0s [Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 2.9s finished [featsel] 50 features after 5 feature selection runs [featsel] 40 features after correlation filtering [featsel] 32 features after noise filtering [AutoFeat] Computing 17 new features. [AutoFeat] 17/ 17 new features ...done. [AutoFeat] Final dataframe with 57 feature columns (17 new). [AutoFeat] Training final regression model. 
[AutoFeat] Trained model: largest coefficients: 4.424390165081158 13.454786 * PropertyGFATotal 2.739004 * BuildingAge**3*T_Hospital -1.862610 * T_LowRiseMultifamily*T_NonResidential 1.787341 * ENERGYSTARScore**3*T_Other 1.451768 * SteamUse*T_MixedUseProperty 1.372962 * T_Laboratory -1.345601 * BuildingAge**3*T_Other -1.317675 * T_Self-Storage Facility 1.302402 * BuildingAge*T_SeniorCareCommunity 1.116880 * T_Supermarket / Grocery Store -1.000535 * BuildingAge**2*T_Warehouse 0.796742 * SteamUse*T_Hospital -0.716430 * ENERGYSTARScore*T_DistributionCenter 0.692612 * T_Restaurant 0.656902 * T_Campus 0.576915 * sqrt(BuildingAge)*PropertyGFATotal**3 0.546611 * T_NonresidentialCOS*T_SmallandMidSizedOffice -0.542429 * ENERGYSTARScore*T_Warehouse -0.542373 * T_Distribution Center -0.537186 * ENERGYSTARScore 0.517627 * T_Nonresidential COS 0.496304 * T_NonResidential -0.473476 * T_Worship Facility -0.361730 * ENERGYSTARScore*T_NonResidential 0.305293 * sqrt(BuildingAge)*NaturalGasUse -0.276234 * T_Warehouse 0.266087 * ElectricityUse 0.230773 * SteamUse*T_NonResidential 0.211394 * T_Large Office -0.171861 * ENERGYSTARScore**2*T_WorshipFacility -0.160515 * ENERGYSTARScore**3*exp(BuildingAge) -0.082607 * T_Low-Rise Multifamily [AutoFeat] Final score: 0.8271 [AutoFeat] Computing 17 new features. [AutoFeat] 17/ 17 new features ...done. [AutoFeat] Computing 17 new features. [AutoFeat] 17/ 17 new features ...done. ## Final R^2: 0.8376 [AutoFeat] Computing 17 new features. [AutoFeat] 17/ 17 new features ...done. ### AutoFeat with 3 feateng_steps [AutoFeat] The 3 step feature engineering process could generate up to 338500 features. [AutoFeat] With 2338 data points this new feature matrix would use about 3.17 gb of space. [feateng] Step 1: transformation of original features [feateng] Generated 20 transformed features from 40 original features - done. 
[feateng] Step 2: first combination of features [feateng] Generated 6578 feature combinations from 1770 original feature tuples - done. [feateng] Step 3: transformation of new features [feateng] Generated 21220 transformed features from 6578 original features - done. [feateng] Generated altogether 29347 new features in 3 steps [feateng] Removing correlated features, as well as additions at the highest level [feateng] Generated a total of 8033 additional features [featsel] Scaling data...done. [Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers. [Parallel(n_jobs=-1)]: Done 1 tasks | elapsed: 18.7s [Parallel(n_jobs=-1)]: Done 2 out of 5 | elapsed: 20.2s remaining: 30.3s [Parallel(n_jobs=-1)]: Done 3 out of 5 | elapsed: 21.5s remaining: 14.3s [Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 28.9s remaining: 0.0s [Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 28.9s finished [featsel] 59 features after 5 feature selection runs [featsel] 30 features after correlation filtering [featsel] 24 features after noise filtering [AutoFeat] Computing 23 new features. [AutoFeat] 23/ 23 new features ...done. [AutoFeat] Final dataframe with 63 feature columns (23 new). [AutoFeat] Training final regression model. 
[AutoFeat] Trained model: largest coefficients: 9.45728786373975 2.780273 * (-PropertyGFATotal + T_SelfStorageFacility)**2 -2.408298 * (-LargestPropertyUseTypeGFA + T_Warehouse)**3 2.012431 * ENERGYSTARScore**6*T_Other**3 -1.962587 * (LargestPropertyUseTypeGFA**3 + T_Laboratory)**2 -1.959891 * (-LargestPropertyUseTypeGFA + T_DistributionCenter)**3 -1.821432 * (sqrt(ENERGYSTARScore) - PropertyGFATotal**3)**3 1.315281 * (LargestPropertyUseTypeGFA - T_WorshipFacility)**3 1.241841 * PropertyGFATotal -1.147940 * ElectricityUse**2*PropertyGFATotal**6 -1.121147 * Abs(LargestPropertyUseTypeGFA**2 - T_Warehouse) 0.995564 * Abs(LargestPropertyUseTypeGFA**3 - T_NonresidentialCOS) 0.739478 * (NaturalGasUse + T_Laboratory)**3 -0.727288 * (sqrt(ENERGYSTARScore) - LargestPropertyUseTypeGFA)**3 0.625570 * Abs(T_Campus - T_SupermarketGroceryStore) 0.579689 * exp(T_NonResidential - exp(ENERGYSTARScore)) 0.567573 * (-PropertyGFATotal**2 + T_MidRiseMultifamily)**2 0.527699 * PropertyGFATotal**6*T_NonResidential**3 0.450922 * Abs(PropertyGFATotal**3 - T_LargeOffice) 0.226571 * exp(-T_LowRiseMultifamily + T_SupermarketGroceryStore) -0.220247 * Abs(sqrt(ENERGYSTARScore) - SteamUse) -0.155932 * Abs(ElectricityUse - T_Warehouse) 0.148226 * exp(-ENERGYSTARScore**2 + T_LargeOffice) 0.026964 * exp(NaturalGasUse + SteamUse) 0.007468 * exp(NaturalGasUse + T_Other) [AutoFeat] Final score: 0.8188 [AutoFeat] Computing 23 new features. [AutoFeat] 23/ 23 new features ...done. [AutoFeat] Computing 23 new features. [AutoFeat] 23/ 23 new features ...done. ## Final R^2: 0.7996 [AutoFeat] Computing 23 new features. [AutoFeat] 23/ 23 new features ...done.
L'auto feature engineering n'apporte aucune valeur, elle est donc rejetée.
# Re-split the data (fresh shuffled 70/30 split) for the linear baselines.
resplit = train_test_split(
    df_energy,
    np.ravel(y_energy),
    test_size=0.3,
    shuffle=True,
)
X_train_energy, X_test_energy, y_train_energy, y_test_energy = resplit
Les premiers modèles que nous allons essayer seront des modèles linéaires.
from sklearn.linear_model import RidgeCV, LassoCV, Lasso, LassoLarsCV, ElasticNetCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from pprint import pprint
def test_model_lineaire_scaled(estimator, X_train, X_test, y_train, y_test):
    """Fit a StandardScaler + linear-CV-estimator pipeline and report metrics.

    Prints test-set MSE/RMSE, the selected regularisation strength
    (``alpha_``), and train-set MSE/RMSE/R^2 to gauge overfitting, then
    plots predictions vs. targets and a bar chart of the coefficients.

    Parameters
    ----------
    estimator : CV linear estimator exposing ``alpha_`` and ``coef_``
        after fitting (e.g. RidgeCV, LassoCV, ElasticNetCV).
    X_train, X_test : feature DataFrames (columns used for the bar plot).
    y_train, y_test : array-like regression targets.
    """
    name = type(estimator).__name__
    model_ = Pipeline([('standardize', StandardScaler()), (name, estimator)])
    model_.fit(X_train, np.ravel(y_train))
    r2 = model_.score(X_test, np.ravel(y_test))
    y_pred = model_.predict(X_test)
    y_train_pred = model_.predict(X_train)

    plt.figure()
    plt.scatter(y_pred, np.ravel(y_test), s=2)
    plt.title("%s (R^2: %.4f)" % (name, r2))

    # Test-set metrics.
    # BUGFIX: RMSE was previously computed on the TRAIN set while being
    # printed alongside the test MSE, which made the two incomparable.
    print(name, "MSE: ",
          mean_squared_error(np.ravel(y_test), y_pred))
    print(name, "RMSE: ",
          mean_squared_error(np.ravel(y_test), y_pred, squared=False))
    print(name, "alpha", model_[1].alpha_)

    # Train-set metrics (overfitting check).
    # BUGFIX: r2_score is NOT symmetric; y_true must be the first argument.
    print("Score MSE model sur le jeu d'entrainement", mean_squared_error(
        np.ravel(y_train), y_train_pred))
    print("Score RMSE model sur le jeu d'entrainement", mean_squared_error(
        np.ravel(y_train), y_train_pred, squared=False))
    print("Score r2 model sur le jeu d'entrainement", r2_score(
        np.ravel(y_train), y_train_pred))

    # Bar plot of the fitted (standardized-space) coefficients per feature.
    importance = model_[1].coef_
    fig, ax = plt.subplots(figsize=(27, 10), dpi=300)
    ax.set_xticks([x for x in range(len(importance))])
    ax.set_xticklabels(X_train.columns, rotation=90)
    ax.bar(X_train.columns, importance)
    plt.show()
    return
# Evaluate the three regularised linear baselines with the shared helper.
linear_candidates = (
    RidgeCV(alphas=np.linspace(0.0001, 200, 2000), scoring='r2'),
    LassoCV(alphas=np.linspace(0.0001, 10, 2000), max_iter=10000),
    ElasticNetCV(l1_ratio=np.linspace(0.0001, 1, 200),
                 max_iter=2000,
                 n_alphas=200),
)
for model in linear_candidates:
    test_model_lineaire_scaled(
        model, X_train_energy, X_test_energy, y_train_energy, y_test_energy)
RidgeCV MSE: 0.21192849557648527 RidgeCV RMSE: 0.4904732425027816 RidgeCV alpha 47.82398804402201 Score MSE model sur le jeu d'entrainement 0.24056400161119246 Score RMSE model sur le jeu d'entrainement 0.4904732425027816 Score r2 model sur le jeu d'entrainement 0.7687949129562001
LassoCV MSE: 0.2118608775978164 LassoCV RMSE: 0.49005335525780275 LassoCV alpha 0.0001 Score MSE model sur le jeu d'entrainement 0.24015229099943022 Score RMSE model sur le jeu d'entrainement 0.49005335525780275 Score r2 model sur le jeu d'entrainement 0.7730358332710681
ElasticNetCV MSE: 0.21186867318410704 ElasticNetCV RMSE: 0.49074628802356607 ElasticNetCV alpha 0.0221226673467136 Score MSE model sur le jeu d'entrainement 0.24083191920890884 Score RMSE model sur le jeu d'entrainement 0.49074628802356607 Score r2 model sur le jeu d'entrainement 0.7672971510803136
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
def print_results_xgboost(estimator, X_train, X_test, y_train, y_test):
    """Fit a GridSearchCV-wrapped regressor and report its metrics.

    Prints test-set MSE/RMSE, train-set MSE/RMSE/R^2 (overfitting check)
    and the best hyper-parameters, then plots predictions vs. targets and
    the feature importances of the refit best estimator.

    Parameters
    ----------
    estimator : fitted-on-call GridSearchCV exposing ``best_params_`` and
        ``best_estimator_.feature_importances_`` (e.g. wrapping XGBRegressor).
    X_train, X_test : feature DataFrames (columns used for the bar plot).
    y_train, y_test : array-like regression targets.
    """
    estimator.fit(X_train, y_train)
    r2 = estimator.score(X_test, y_test)
    y_pred = estimator.predict(X_test)
    # Predict the training set once (was computed twice before).
    y_train_pred = estimator.predict(X_train)

    plt.figure()
    plt.scatter(y_pred, np.ravel(y_test), s=2)
    plt.title("%s (R^2: %.4f)" % (type(estimator).__name__, r2))

    # Test-set metrics.
    # BUGFIX: RMSE was previously computed on the TRAIN predictions while
    # printed next to the test MSE.
    print("MSE: ",
          mean_squared_error(np.ravel(y_test), y_pred))
    print("RMSE: ",
          mean_squared_error(np.ravel(y_test), y_pred, squared=False))

    # Train-set metrics (overfitting check).
    # BUGFIX: r2_score is NOT symmetric; y_true must be the first argument.
    print("Score MSE model sur le jeu d'entrainement", mean_squared_error(
        np.ravel(y_train), y_train_pred))
    print("Score RMSE model sur le jeu d'entrainement", mean_squared_error(
        np.ravel(y_train), y_train_pred, squared=False))
    print("Score r2 model sur le jeu d'entrainement", r2_score(
        np.ravel(y_train), y_train_pred))
    print(estimator.best_params_)

    # Feature importances from the refit best estimator of the grid search.
    importance = estimator.best_estimator_.feature_importances_
    fig, ax = plt.subplots(figsize=(27, 10), dpi=300)
    ax.set_xticks([x for x in range(len(importance))])
    ax.set_xticklabels(X_train.columns, rotation=90)
    ax.bar(X_train.columns, importance)
    plt.show()
    return
# Hyper-parameter grid for XGBoost: 288 combinations, scored by R^2
# with 2-fold cross-validation.
xgb1 = XGBRegressor()
parameters = {
    'nthread': [0],  # when use hyperthread, xgboost may become slower
    'objective': ['reg:squarederror'],
    'booster': ['gbtree'],
    'learning_rate': [.03, 0.07, 0.5],  # so called `eta` value
    'max_depth': [5, 6, 7],
    'min_child_weight': [2, 4],
    'subsample': [0.3, 0.7],
    'colsample_bytree': [0.7],
    'n_estimators': [100, 200],
    'reg_alpha': [0.01, 0.5],
    'reg_lambda': [0.01, 0.5],
}
xgb_grid = GridSearchCV(xgb1, parameters, cv=2, scoring="r2", verbose=1)
print_results_xgboost(xgb_grid, X_train_energy,
                      X_test_energy, y_train_energy, y_test_energy)
Fitting 2 folds for each of 288 candidates, totalling 576 fits
MSE: 0.19832187682140745
RMSE: 0.336835454213715
Score MSE model sur le jeu d'entrainement 0.11345812321535972
Score RMSE model sur le jeu d'entrainement 0.336835454213715
Score r2 model sur le jeu d'entrainement 0.8988850835726105
{'booster': 'gbtree', 'colsample_bytree': 0.7, 'learning_rate': 0.07, 'max_depth': 5, 'min_child_weight': 2, 'n_estimators': 200, 'nthread': 0, 'objective': 'reg:squarederror', 'reg_alpha': 0.01, 'reg_lambda': 0.5, 'subsample': 0.3}
# Reload the clean tables, then split again WITHOUT the ENERGYSTARScore
# column to measure how much that single feature contributes.
df_energy = pd.read_csv(
    "/Users/loicvalenti/Library/Mobile Documents/com~apple~CloudDocs/Formation Data Science/PROJET 4/Notebooks and files for pres/FILES NOTEBOOKS CSV/df_energy.csv"
).drop(columns="Unnamed: 0")
y_energy = pd.read_csv(
    "/Users/loicvalenti/Library/Mobile Documents/com~apple~CloudDocs/Formation Data Science/PROJET 4/Notebooks and files for pres/FILES NOTEBOOKS CSV/y_energy.csv"
).drop(columns="Unnamed: 0")

features_sans_score = df_energy.drop(columns=["ENERGYSTARScore"])
X_train_energy, X_test_energy, y_train_energy, y_test_energy = train_test_split(
    features_sans_score,
    np.ravel(y_energy),
    test_size=0.3,
    shuffle=True,
)
# Repeat the AutoFeat sweep (0..3 steps) on the feature set without
# ENERGYSTARScore, reporting held-out R^2 per setting.
for steps in range(4):
    np.random.seed(55)  # reproducible selection per run
    print("### AutoFeat with %i feateng_steps" % steps)
    afreg = AutoFeatRegressor(
        verbose=1,
        feateng_steps=steps,
        n_jobs=-1,
        max_gb=4,
        transformations=("exp", "abs", "sqrt", "^2", "^3"),
    )
    df_train_energy = afreg.fit_transform(X_train_energy,
                                          np.ravel(y_train_energy))
    df_test_energy = afreg.transform(X_test_energy)
    r2 = afreg.score(X_test_energy, np.ravel(y_test_energy))
    print("## Final R^2: %.4f" % r2)
    # Visual check: predictions against actual targets.
    plt.figure()
    plt.scatter(afreg.predict(X_test_energy), np.ravel(y_test_energy), s=2)
    plt.title("%i FE steps (R^2: %.4f; %i new features)"
              % (steps, r2, len(afreg.new_feat_cols_)))
### AutoFeat with 0 feateng_steps [AutoFeat] The 0 step feature engineering process could generate up to 39 features. [AutoFeat] With 2338 data points this new feature matrix would use about 0.00 gb of space. [feateng] Warning: no features generated for max_steps < 1. [featsel] Scaling data...done. [Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers. [Parallel(n_jobs=-1)]: Done 1 tasks | elapsed: 4.3s [Parallel(n_jobs=-1)]: Done 2 out of 5 | elapsed: 4.3s remaining: 6.5s [Parallel(n_jobs=-1)]: Done 3 out of 5 | elapsed: 4.3s remaining: 2.9s [Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 4.4s remaining: 0.0s [Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 4.4s finished [featsel] 27 features after 5 feature selection runs [featsel] 23 features after correlation filtering [featsel] 23 features after noise filtering [AutoFeat] Final dataframe with 39 feature columns (0 new). [AutoFeat] Training final regression model. [AutoFeat] Trained model: largest coefficients: 4.389793720280057 13.212339 * PropertyGFATotal 1.567704 * T_Laboratory 1.453937 * T_Hospital 1.367309 * T_Supermarket / Grocery Store -1.152061 * T_Self-Storage Facility 0.849787 * T_Restaurant 0.737246 * T_Campus -0.643524 * T_Warehouse -0.551443 * T_Worship Facility -0.519342 * T_Distribution Center 0.468229 * T_Refrigerated Warehouse 0.439464 * NaturalGasUse 0.421555 * T_Senior Care Community 0.386603 * T_Nonresidential COS 0.343379 * T_Medical Office 0.323337 * SteamUse -0.230672 * T_Multifamily MR (5-9) 0.211138 * T_Other 0.201648 * T_Hotel -0.194008 * T_Low-Rise Multifamily 0.155089 * T_Large Office 0.130430 * T_NonResidential 0.125740 * T_Mixed Use Property [AutoFeat] Final score: 0.7716 ## Final R^2: 0.7753 ### AutoFeat with 1 feateng_steps [AutoFeat] The 1 step feature engineering process could generate up to 195 features. [AutoFeat] With 2338 data points this new feature matrix would use about 0.00 gb of space. 
[feateng] Step 1: transformation of original features [feateng] Generated 16 transformed features from 39 original features - done. [feateng] Generated altogether 16 new features in 1 steps [feateng] Removing correlated features, as well as additions at the highest level [feateng] Generated a total of 1 additional features [featsel] Scaling data...done. [Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers. [Parallel(n_jobs=-1)]: Done 1 tasks | elapsed: 0.2s [Parallel(n_jobs=-1)]: Done 2 out of 5 | elapsed: 0.2s remaining: 0.4s [Parallel(n_jobs=-1)]: Done 3 out of 5 | elapsed: 0.3s remaining: 0.2s [Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 2.5s remaining: 0.0s [Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 2.5s finished [featsel] 26 features after 5 feature selection runs [featsel] 23 features after correlation filtering [featsel] 23 features after noise filtering [AutoFeat] Final dataframe with 39 feature columns (0 new). [AutoFeat] Training final regression model. [AutoFeat] Trained model: largest coefficients: 4.389793720280055 13.212339 * PropertyGFATotal 1.567704 * T_Laboratory 1.453937 * T_Hospital 1.367309 * T_Supermarket / Grocery Store -1.152061 * T_Self-Storage Facility 0.849787 * T_Restaurant 0.737246 * T_Campus -0.643524 * T_Warehouse -0.551443 * T_Worship Facility -0.519342 * T_Distribution Center 0.468229 * T_Refrigerated Warehouse 0.439464 * NaturalGasUse 0.421555 * T_Senior Care Community 0.386603 * T_Nonresidential COS 0.343379 * T_Medical Office 0.323337 * SteamUse -0.230672 * T_Multifamily MR (5-9) 0.211138 * T_Other 0.201648 * T_Hotel -0.194008 * T_Low-Rise Multifamily 0.155089 * T_Large Office 0.130430 * T_NonResidential 0.125740 * T_Mixed Use Property [AutoFeat] Final score: 0.7716 ## Final R^2: 0.7753 ### AutoFeat with 2 feateng_steps [AutoFeat] The 2 step feature engineering process could generate up to 19110 features. [AutoFeat] With 2338 data points this new feature matrix would use about 0.18 gb of space. 
[feateng] Step 1: transformation of original features [feateng] Generated 16 transformed features from 39 original features - done. [feateng] Step 2: first combination of features [feateng] Generated 1002 feature combinations from 1485 original feature tuples - done. [feateng] Generated altogether 1497 new features in 2 steps [feateng] Removing correlated features, as well as additions at the highest level [feateng] Generated a total of 266 additional features [featsel] Scaling data...done. [Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers. [Parallel(n_jobs=-1)]: Done 1 tasks | elapsed: 0.5s [Parallel(n_jobs=-1)]: Done 2 out of 5 | elapsed: 0.5s remaining: 0.7s [Parallel(n_jobs=-1)]: Done 3 out of 5 | elapsed: 0.5s remaining: 0.3s [Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 2.6s remaining: 0.0s [Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 2.6s finished [featsel] 46 features after 5 feature selection runs [featsel] 37 features after correlation filtering [featsel] 31 features after noise filtering [AutoFeat] Computing 14 new features. [AutoFeat] 14/ 14 new features ...done. [AutoFeat] Final dataframe with 53 feature columns (14 new). [AutoFeat] Training final regression model. 
[AutoFeat] Trained model: largest coefficients: 4.0935516523734545 13.633462 * PropertyGFATotal -2.317511 * T_LowRiseMultifamily*T_NonResidential 1.971572 * BuildingAge*T_Hospital 1.317148 * T_Supermarket / Grocery Store 1.288570 * BuildingAge*T_SeniorCareCommunity 1.273079 * sqrt(BuildingAge)*T_Laboratory -1.205078 * BuildingAge**3*T_Other -1.177774 * T_Self-Storage Facility 0.877490 * NaturalGasUse*T_Hospital 0.819104 * T_Restaurant 0.782076 * T_Laboratory 0.773348 * SteamUse*T_MixedUseProperty -0.656406 * T_Warehouse -0.613515 * T_Worship Facility -0.596817 * T_Distribution Center 0.543648 * NaturalGasUse*T_NonresidentialCOS 0.530506 * sqrt(BuildingAge)*NaturalGasUse 0.527236 * T_Campus 0.523133 * T_Campus*T_MixedUseProperty -0.450013 * BuildingAge**3*T_Warehouse 0.439839 * T_Other -0.420277 * BuildingAge**3*T_NonResidential 0.247499 * T_NonResidential 0.240155 * T_Medical Office -0.205928 * T_Low-Rise Multifamily 0.177270 * SteamUse -0.175070 * T_Mid-Rise Multifamily 0.163751 * T_Hotel 0.148933 * SteamUse*T_NonResidential 0.134208 * ElectricityUse*T_MixedUseProperty 0.098851 * NaturalGasUse [AutoFeat] Final score: 0.7903 [AutoFeat] Computing 14 new features. [AutoFeat] 14/ 14 new features ...done. [AutoFeat] Computing 14 new features. [AutoFeat] 14/ 14 new features ...done. ## Final R^2: 0.7790 [AutoFeat] Computing 14 new features. [AutoFeat] 14/ 14 new features ...done. ### AutoFeat with 3 feateng_steps [AutoFeat] The 3 step feature engineering process could generate up to 321750 features. [AutoFeat] With 2338 data points this new feature matrix would use about 3.01 gb of space. [feateng] Step 1: transformation of original features [feateng] Generated 16 transformed features from 39 original features - done. [feateng] Step 2: first combination of features [feateng] Generated 5454 feature combinations from 1485 original feature tuples - done. 
[feateng] Step 3: transformation of new features [feateng] Generated 17058 transformed features from 5454 original features - done. [feateng] Generated altogether 24000 new features in 3 steps [feateng] Removing correlated features, as well as additions at the highest level [feateng] Generated a total of 6312 additional features [featsel] Scaling data...done. [Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers. [Parallel(n_jobs=-1)]: Done 1 tasks | elapsed: 11.2s [Parallel(n_jobs=-1)]: Done 2 out of 5 | elapsed: 15.2s remaining: 22.8s [Parallel(n_jobs=-1)]: Done 3 out of 5 | elapsed: 18.7s remaining: 12.5s [Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 21.2s remaining: 0.0s [Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 21.2s finished [featsel] 51 features after 5 feature selection runs [featsel] 26 features after correlation filtering [featsel] 22 features after noise filtering [AutoFeat] Computing 21 new features. [AutoFeat] 21/ 21 new features ...done. [AutoFeat] Final dataframe with 60 feature columns (21 new). [AutoFeat] Training final regression model. 
[AutoFeat] Trained model: largest coefficients: 8.699859208011802 2.849746 * PropertyGFATotal 2.331322 * (-PropertyGFATotal + T_SelfStorageFacility)**2 1.532488 * Abs(LargestPropertyUseTypeGFA**2 - T_Warehouse) -1.423152 * (-LargestPropertyUseTypeGFA + T_WorshipFacility)**3 1.093306 * (PropertyGFATotal - T_DistributionCenter)**3 0.866692 * Abs(T_Campus - T_SupermarketGroceryStore) 0.665241 * Abs(SteamUse - T_SupermarketGroceryStore) 0.540785 * (-PropertyGFATotal**2 + T_MidRiseMultifamily)**2 0.525178 * (-PropertyGFATotal**2 + T_LowRiseMultifamily)**3 0.488354 * Abs(PropertyGFABuildings**2 - T_K12School) -0.469292 * exp(-NaturalGasUse + T_Warehouse) 0.393062 * exp(SteamUse - T_LowRiseMultifamily) 0.332585 * Abs(BuildingAge**2 - T_NonResidential) -0.279207 * exp(NaturalGasUse + SteamUse) 0.182263 * (ElectricityUse + T_NonresidentialCOS)**2 0.168807 * (NaturalGasUse + T_Laboratory)**3 -0.158190 * (ElectricityUse + T_SupermarketGroceryStore)**2 0.108155 * (NaturalGasUse + T_Restaurant)**3 0.103437 * (ElectricityUse + T_Hospital)**3 0.060580 * exp(NaturalGasUse + T_MixedUseProperty) 0.038440 * (ElectricityUse + T_NonResidential)**2 0.027984 * exp(NaturalGasUse + T_Other) [AutoFeat] Final score: 0.7771 [AutoFeat] Computing 21 new features. [AutoFeat] 21/ 21 new features ...done. [AutoFeat] Computing 21 new features. [AutoFeat] 21/ 21 new features ...done. ## Final R^2: 0.7775 [AutoFeat] Computing 21 new features. [AutoFeat] 21/ 21 new features ...done.
Une fois de plus, même sans l'ENERGYSTARScore, l'autofeat n'améliore pas la performance, nous ne l'utilisons pas.
def Test_Sans_ENERGYSTARScore(X_train, X_test, y_train, y_test):
    """Benchmark models on the dataset WITHOUT the ENERGYSTARScore feature.

    Fits and reports three cross-validated linear models (Ridge, Lasso,
    ElasticNet) via ``test_model_lineaire_scaled``, then grid-searches an
    XGBoost regressor and reports it via ``print_results_xgboost``.

    Parameters
    ----------
    X_train, X_test : feature matrices (train / test split).
    y_train, y_test : corresponding targets.

    Returns
    -------
    None — all results are printed by the helper functions.
    """
    # Linear baselines: each *CV model picks its own regularization strength.
    for model in [
        RidgeCV(alphas=np.linspace(0.0001, 200, 2000), scoring='r2'),
        LassoCV(alphas=np.linspace(0.0001, 10, 10000), max_iter=2000),
        ElasticNetCV(l1_ratio=np.linspace(0.0001, 1, 200),
                     max_iter=2000,
                     n_alphas=200)
    ]:
        test_model_lineaire_scaled(model, X_train, X_test, y_train, y_test)

    # Gradient-boosted trees: small grid over the main XGBoost knobs.
    xgb1 = XGBRegressor()
    parameters = {'nthread': [0],  # when using hyperthreading, xgboost may become slower
                  'objective': ['reg:squarederror'],
                  'booster': ['gbtree'],
                  'learning_rate': [.03, 0.07, 0.5],  # so-called `eta` value
                  'max_depth': [5, 6, 7],
                  'min_child_weight': [2, 4],
                  'subsample': [0.3, 0.7],
                  'colsample_bytree': [0.7],
                  'n_estimators': [100, 200],
                  'reg_alpha': [0.01, 0.5],
                  'reg_lambda': [0.01, 0.5]}
    # cv=2 keeps the 288-candidate search tractable (576 fits total).
    xgb_grid = GridSearchCV(xgb1,
                            parameters,
                            cv=2,
                            scoring="r2",
                            verbose=1)
    print_results_xgboost(xgb_grid, X_train, X_test, y_train, y_test)
# Run the benchmark on the energy train/test split (ENERGYSTARScore excluded).
Test_Sans_ENERGYSTARScore(
    X_train_energy, X_test_energy, y_train_energy, y_test_energy)
RidgeCV MSE: 0.29815138464248453 RidgeCV RMSE: 0.5308143650079693 RidgeCV alpha 75.83798104052026 Score MSE model sur le jeu d'entrainement 0.2817638900988137 Score RMSE model sur le jeu d'entrainement 0.5308143650079693 Score r2 model sur le jeu d'entrainement 0.7094131338830072
LassoCV MSE: 0.2984842056698674 LassoCV RMSE: 0.5302607864686006 LassoCV alpha 0.0021001800180018002 Score MSE model sur le jeu d'entrainement 0.28117650166629893 Score RMSE model sur le jeu d'entrainement 0.5302607864686006 Score r2 model sur le jeu d'entrainement 0.7137422464177562
ElasticNetCV MSE: 0.29822293848298614 ElasticNetCV RMSE: 0.5306831084832615 ElasticNetCV alpha 0.021835575763053745 Score MSE model sur le jeu d'entrainement 0.28162456162945704 Score RMSE model sur le jeu d'entrainement 0.5306831084832615 Score r2 model sur le jeu d'entrainement 0.7102898560841563
Fitting 2 folds for each of 288 candidates, totalling 576 fits
MSE: 0.2892240478401188
RMSE: 0.42561640521305333
Score MSE model sur le jeu d'entrainement 0.181149324386482
Score RMSE model sur le jeu d'entrainement 0.42561640521305333
Score r2 model sur le jeu d'entrainement 0.8152495855697264
{'booster': 'gbtree', 'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 6, 'min_child_weight': 2, 'n_estimators': 200, 'nthread': 0, 'objective': 'reg:squarederror', 'reg_alpha': 0.5, 'reg_lambda': 0.01, 'subsample': 0.7}
Après les tests de modèles effectués, nous choisissons le modèle Lasso, pour sa performance et son bonus de réduction dimensionnelle, qui permet une meilleure interprétabilité.